Compile EPA emissions data

Convert the data from hourly to monthly and export all years as a single file.

Instructions

Set `file_format` below to the file format (csv or feather) that was used when saving the hourly EPA data; it must match the files on disk. Feather is much faster for read/write.


In [ ]:
import pandas as pd
import numpy as np
import os
from os.path import join
from joblib import Parallel, delayed
import sys

# Directory the notebook is executed from
cwd = os.getcwd()
# Data files live in 'Data storage', one level above the working directory
data_path = os.path.join(cwd, '..', 'Data storage')

In [ ]:
# Extension of the yearly hourly-EPA input files; it is substituted into the
# file names that are read later on. Switch formats by (un)commenting one of
# the two assignments.
file_format = 'csv'
# file_format = 'feather'

In [ ]:
# Load the watermark extension and print the Python/IPython version (-v)
# plus the versions of the imported packages (-iv) for reproducibility.
%load_ext watermark
%watermark -iv -v

In [ ]:
# Load the "autoreload" extension
%load_ext autoreload

# Mode 1: before running each cell, reload only the modules that were
# registered with "%aimport" (so edits to them are picked up without a
# kernel restart)
%autoreload 1

# add the 'src' directory as one where we can import modules
src_dir = join(os.getcwd(), os.pardir, 'src')
# Only append when missing, so re-running this cell does not pile up
# duplicate entries on sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [ ]:
# Register the project's data-extraction module for autoreload, then import
# the helpers from it.
# NOTE(review): unit_conversion is not used in the cells visible here.
%aimport Data.data_extraction
from Data.data_extraction import import_group_epa, unit_conversion

# Same for the index module.
# NOTE(review): add_datetime and add_quarter are not used in the cells
# visible here.
%aimport Analysis.index
from Analysis.index import add_datetime, add_quarter

Change `start_year` and `end_year` if necessary. Both years are inclusive.


In [ ]:
# First and last year to load; both ends are included.
start_year = 2001
end_year = 2017

# Guard kept from the original: the parallel load only runs when this code is
# the main program (as it is inside a notebook), not when it is imported.
if __name__ == '__main__':
    base_path = join(data_path, 'EPA emissions')

    # One input file per year, named "EPA emissions <year>.<file_format>"
    paths = [
        join(base_path, 'EPA emissions {}.{}'.format(year, file_format))
        for year in range(start_year, end_year + 1)
    ]

    # Run import_group_epa on every yearly file, one task per file.
    # n_jobs=-1 lets joblib use all available CPUs.
    parallel = Parallel(n_jobs=-1)
    df_list = parallel(delayed(import_group_epa)(path) for path in paths)

In [ ]:
# Stack the per-year results into a single object. With the default
# arguments each piece keeps its own row index, so index values may repeat.
# (presumably every item in df_list is a DataFrame returned by
# import_group_epa -- confirm against that function)
df = pd.concat(df_list)

In [ ]:
# Destination of the combined dataset; the file name carries a fixed date stamp
path = join(
    data_path,
    'Derived data',
    'Monthly EPA emissions 2018-03-06.csv',
)
# Write the rows only -- the DataFrame index is not saved as a column
df.to_csv(path, index=False)

In [ ]: